#library for data manipulation
import pandas as pd
import numpy as np
# for Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Importing the data
# NOTE(review): hard-coded absolute Windows path — update if the CSV lives
# elsewhere.  The frame `USA` is the working dataset for the whole script.
USA = pd.read_csv('E:\\Data for Assignment\\US_Accidents_May19.csv')
This is a countrywide traffic accident dataset covering 49 states of the United States. The data was collected from February 2016 to March 2019 using several data providers, including two APIs that provide streaming traffic event data. These APIs broadcast traffic events captured by a variety of entities, such as the US and state departments of transportation, law enforcement agencies, traffic cameras, and traffic sensors within the road networks. Currently, there are about 2.25 million accident records in this dataset.
# Inspect the dimensions of the raw dataset (rows, columns).
USA.shape
# Preview the first few records to see the available columns.
USA.head()
# Correlation heatmap over the numeric accident/weather measurements.
numeric_cols = ['TMC', 'Severity', 'Start_Lat', 'End_Lat', 'Distance(mi)',
                'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
                'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)']
fig = sns.heatmap(USA[numeric_cols].corr(), annot=True, cmap='RdYlGn',
                  linewidths=0.2, annot_kws={'size': 15})
fig = plt.gcf()
fig.set_size_inches(18, 15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
# Data type and non-null count of every column.
USA.info()
# Summary statistics (count, mean, std dev, min, max, quartiles).
USA.describe()
# Columns not used for modelling (identifiers, free text, geo coordinates and
# sparsely populated fields) are removed up-front.
unused_columns = [
    "ID", "Source", "TMC", "Start_Lat", "Start_Lng", "End_Lat", "End_Lng",
    "Distance(mi)", "Description", "Number", "Street", "Side", "City",
    "County", "Zipcode", "Country", "Airport_Code", "Weather_Timestamp",
    "Wind_Direction", "Precipitation(in)",
]
USA = USA.drop(unused_columns, axis=1)
# Confirm the columns above are gone.
USA.columns
# Dimensions after the column drop.
USA.shape
# Treating start date and time: derive year and month features from the
# accident start timestamp.
USA['st_year'] = pd.DatetimeIndex(USA['Start_Time']).year
USA['st_Month'] = pd.DatetimeIndex(USA['Start_Time']).month
USA.head()
# Number of data points recorded for each year.
# (Fixed: the original `USA[USA.st_year != '2016']` line was a no-op bug —
# it compared the integer year against the string '2016' and discarded the
# resulting frame.)
USA.groupby('st_year')["st_year"].count()
# Keep only the 2019 accidents: the dataset is large, so the analysis is
# restricted to the most recent year.  One isin-based drop replaces the four
# separate per-year drops of the original.
USA.drop(USA[USA['st_year'].isin([2015, 2016, 2017, 2018])].index, inplace=True)
# Verify that only 2019 remains, and check the resulting size.
USA.groupby('st_year')["st_year"].count()
USA.shape
# Checking for the Null values.
USA.isna().sum()
# Replace missing values with the column mode.  Plain assignment is used
# instead of chained `Series.fillna(..., inplace=True)`, which is deprecated
# under pandas copy-on-write and stops working in pandas 3.0.
# NOTE(review): mode imputation on continuous weather columns
# (Temperature, Pressure, ...) is unusual — median is the more common choice;
# kept as the original authors intended.
mode_filled_columns = [
    'Timezone', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
    'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Weather_Condition',
    'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
    'Astronomical_Twilight',
]
for col in mode_filled_columns:
    USA[col] = USA[col].fillna(USA[col].mode()[0])
# Confirm no missing values remain in the imputed columns.
USA.isna().sum()
# Severity distribution: pie chart (share) and count plot (absolute numbers).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
severity_counts = USA['Severity'].value_counts()
# Build the explode list from the actual number of severity classes — the
# original hard-coded five entries, which raises ValueError whenever the data
# contains a different number of classes.
explode = [0] + [0.1] * (len(severity_counts) - 1)
severity_counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Severity Distribution in Percentage')
ax[0].set_ylabel('No. of Accidents')
# seaborn >= 0.13 removed positional column arguments; pass x= explicitly.
sns.countplot(x='Severity', data=USA, ax=ax[1], order=severity_counts.index)
ax[1].set_title('Count of Accidents Severitywise')
plt.show()
# Bar chart: accident frequency for every recorded weather condition,
# most frequent first.
fig, ax = plt.subplots(figsize=(16, 7))
weather_counts = USA['Weather_Condition'].value_counts().sort_values(ascending=False)
weather_counts.plot.bar(width=0.5, edgecolor='k', align='center', linewidth=2)
plt.xlabel('Weather_Condition', fontsize=20)
plt.ylabel('Number of Accidents', fontsize=20)
ax.tick_params(labelsize=20)
plt.title('No.of accidents based on Weather Condition', fontsize=25)
plt.grid()
plt.ioff()
# Accident distribution across US timezones: pie chart plus count plot.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
timezone_counts = USA['Timezone'].value_counts()
# Derive the explode list from the number of timezones present (emphasising
# the third-largest slice, as in the original); the original hard-coded four
# entries and raises ValueError for any other count.
explode = [0.1 if i == 2 else 0 for i in range(len(timezone_counts))]
timezone_counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Accidents in Different Timezone')
# seaborn >= 0.13 removed positional column arguments; pass x= explicitly.
sns.countplot(x='Timezone', data=USA, ax=ax[1], order=timezone_counts.index)
ax[1].set_title('Accident Count Based on Timezone')
plt.show()
# Bar chart: number of accidents recorded per state, most affected first.
fig, ax = plt.subplots(figsize=(16, 10))
state_counts = USA['State'].value_counts().sort_values(ascending=False)
state_counts.plot.bar(width=0.5, edgecolor='k', align='center', linewidth=2)
plt.xlabel('State', fontsize=20)
plt.ylabel('No. of Accidents', fontsize=20)
ax.tick_params(labelsize=10)
plt.title('Statewise Accidents', fontsize=25)
plt.grid()
plt.ioff()
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(15, 7))
# Accident counts per (year, month), one line per severity level.
USA.groupby(['st_year','st_Month','Severity']).count()['State'].unstack().plot(ax=ax)
# Fixed: the x axis is (year, month) groups, not weeks — the original label
# incorrectly said 'Week'.
ax.set_xlabel('Month')
ax.set_ylabel('Number of Accidents')
USA.head()
# One-hot encode the low-cardinality categorical columns.
# Fixed: 'Civil_Twilight' is left out of the dummy list because it is dropped
# below — the original dummy-encoded it first (replacing the column with
# 'Civil_Twilight_Day'/'Civil_Twilight_Night') and then tried to drop the
# no-longer-existing 'Civil_Twilight' column, raising a KeyError.
USA = pd.get_dummies(USA, columns=['Timezone','Amenity',"Bump","Crossing","Give_Way","Junction","No_Exit","Railway","Roundabout",
                                   "Station","Stop","Traffic_Calming","Traffic_Signal","Turning_Loop","Sunrise_Sunset",
                                   "Nautical_Twilight","Astronomical_Twilight"])
USA.columns
# Weather_Condition has too many distinct classes to dummy-encode usefully;
# Civil_Twilight is dropped as redundant with the other twilight indicators.
USA = USA.drop('Weather_Condition',axis = 1)
USA = USA.drop('Civil_Twilight',axis = 1)
USA.info()
USA.head()
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
# Working copy for the linear-regression experiment: the raw timestamps and
# the high-cardinality State column cannot be used as numeric features, so
# they are removed in one chained drop.
USA_Linear = USA.drop(['Start_Time', 'End_Time', 'State'], axis=1)
# Sanity-check the reduced frame.
USA_Linear.shape
USA_Linear.head()
USA_Linear.info()
# 80/20 train/test split for the linear-regression baseline.
X_train_Li, X_test_Li, Y_train_Li, Y_test_Li = train_test_split(
    USA_Linear.drop("Severity", axis=1), USA_Linear['Severity'],
    test_size=0.2, random_state=112)
# Fixed: `normalize=True` was deprecated in scikit-learn 1.0 and removed in
# 1.2; for plain OLS the fitted predictions are the same without it, so the
# parameter is simply dropped.
my_model = LinearRegression()
my_model.fit(X_train_Li, Y_train_Li)
predictions = my_model.predict(X_test_Li)
# Spot-check five predictions against the actual severities.
pd.DataFrame({'actual value': Y_test_Li, 'predictions':predictions}).sample(5)
# R^2 on the held-out test split.
my_model.score(X_test_Li, Y_test_Li)
# Checking for Coefficient and Constant Values.
my_model.coef_
my_model.intercept_
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Standard regression error metrics on the test split.
mean_absolute_error(Y_test_Li, predictions)
mean_squared_error(Y_test_Li, predictions)
r2_score(Y_test_Li, predictions)
End of Linear Regression.
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import statsmodels.api as sm
# Assigning the USA_Linear dataset only, as there are few columns removed in the USA_Linear data.
# NOTE(review): this is an alias, not a copy — any in-place modification of
# USA_Logistic would also mutate USA_Linear.  No in-place modification happens
# below, so behaviour is unaffected; use .copy() if that ever changes.
USA_Logistic = USA_Linear
USA_Logistic.head()
USA_Logistic.info()
# 80/20 train/test split for the logistic-regression experiment.
# Fixed for consistency: the target is taken from USA_Logistic itself — the
# original indexed the USA frame (row-aligned, so values were identical, but
# mixing frames in one split is error-prone).
X_train_Lg, X_test_Lg, Y_train_Lg, Y_test_Lg = train_test_split(
    USA_Logistic.drop('Severity', axis=1), USA_Logistic['Severity'],
    test_size=0.2, random_state=156)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logit = LogisticRegression()
logit.fit(X_train_Lg, Y_train_Lg)
predictions = logit.predict(X_test_Lg)
predictions[:5]
# Accuracy on the test split.
logit.score(X_test_Lg, Y_test_Lg)
# Misclassification rate (1 - accuracy).
1-logit.score(X_test_Lg, Y_test_Lg)
# Same accuracy via the metrics helper.
accuracy_score(Y_test_Lg, predictions)
# Number of correctly classified observations.
accuracy_score(Y_test_Lg, predictions, normalize=False)
# Number of incorrectly classified observations.
len(Y_test_Lg) - accuracy_score(Y_test_Lg, predictions, normalize=False)
from sklearn.metrics import log_loss
import numpy as np
# Peek at the test labels.
Y_test_Lg.head()
# Fixed: log_loss expects class-membership *probabilities*, not hard class
# labels — the original passed `predictions` (labels), which is either
# rejected by scikit-learn or scored incorrectly.  Use the model's predicted
# probabilities instead.
log_loss(Y_test_Lg, logit.predict_proba(X_test_Lg))
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
confusion_mat = confusion_matrix(Y_test_Lg, predictions)
confusion_df = pd.DataFrame(confusion_mat, index=['A1','A2','A3','A4'], columns=['P1','P2','P3','P4'])
confusion_df
_=sns.heatmap(confusion_df, cmap='coolwarm', annot=True)
from sklearn.metrics import precision_score, recall_score
precision_score(Y_test_Lg, predictions, average=None)
recall_score(Y_test_Lg, predictions, average=None)
from sklearn.metrics import f1_score
f1_score(Y_test_Lg, predictions,average=None)
# Decision-tree classifier on the same feature set.
# NOTE(review): alias, not a copy — USA_DT and USA_Linear are the same frame.
USA_DT = USA_Linear
USA_DT.columns
# Importing the train Test package from Sklearn:
from sklearn.model_selection import train_test_split
# Fixed for consistency: the target comes from USA_DT itself — the original
# indexed the USA frame (row-aligned, so values were identical, but mixing
# frames in one split is error-prone).
X_train_dt, X_test_dt, Y_train_dt, Y_test_dt = train_test_split(
    USA_DT.drop("Severity", axis=1), USA_DT['Severity'],
    test_size=0.2, random_state=123)
# Importing Decision Tree Package
from sklearn.tree import DecisionTreeClassifier
# Depth is capped so the tree does not simply memorise the training data.
tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_train_dt, Y_train_dt)
predictions = tree.predict(X_test_dt)
# Checking the top 5 predictions and actual values.
predictions[:5]
Y_test_dt[:5]
# Importing accuracy score and confusion matrix packages.
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(Y_test_dt, predictions)
accuracy_score(Y_test_dt, predictions, normalize=False)
confusion_mat_dt = confusion_matrix(Y_test_dt, predictions)
# Derive row/column labels from the classes the tree learned — the original
# hard-coded five labels, which raises ValueError unless exactly five
# severity classes are present.
confusion_df = pd.DataFrame(confusion_mat_dt,
                            index=['A%s' % c for c in tree.classes_],
                            columns=['P%s' % c for c in tree.classes_])
confusion_df
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test_dt, predictions))
print(classification_report(Y_test_dt, predictions))
# Viewing the tree output using Graphviz
# Fixed: heatmap the decision-tree confusion matrix (confusion_mat_dt) — the
# original reused `confusion_mat` from the logistic-regression section.
_=sns.heatmap(confusion_mat_dt, cmap='coolwarm', annot=True)
from sklearn.tree import export_graphviz
import graphviz
# Render the fitted tree to Graphviz DOT source for inline display.
dot_data = export_graphviz(tree, filled=True, rounded=True,
                           feature_names=X_train_dt.columns, out_file=None)
graphviz.Source(dot_data)
import pandas as pd
from pandas import Series
# Relative importance of each feature in the fitted tree.
tree.feature_importances_
pd.Series(tree.feature_importances_, X_train_dt.columns)
from pydotplus import graph_from_dot_data
graph = graph_from_dot_data(dot_data)
# NOTE(review): the model is a *classifier*, but the file name says
# "Regressor_tree" — kept as-is so any downstream reference still finds it.
graph.write_png("Regressor_tree.png")
# Persist the cleaned feature set so it can be worked on in the next file.
# index=False prevents the row index from being written as an extra
# 'Unnamed: 0' column when the CSV is read back.
USA_Linear.to_csv("E:\\Data for Assignment\\US_Accidents_May19_Part_2.csv", index=False)
# Hyper-parameter search for a DecisionTreeRegressor on the same split.
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Candidate values for tree depth and the number of features tried per split.
param_grid = [{"max_depth":[3, 4, 5, None], "max_features":[2,3,4]}]
gs = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=123),
    param_grid=param_grid,
    cv=20,
)
# Fit every parameter combination on the training data (20-fold CV).
gs.fit(X_train_dt, Y_train_dt)
# Every hyper-parameter combination that was tried.
gs.cv_results_['params']
# The combination with the best cross-validated score.
gs.best_params_
# Rank of each tried combination (1 = best).
gs.cv_results_['rank_test_score']
# The refitted best estimator.
gs.best_estimator_
# Predictions from the tuned model on the held-out split.
predictions2 = gs.predict(X_test_dt)
# Error metrics for the tuned regressor.
from sklearn.metrics import mean_absolute_error, r2_score
mean_absolute_error(Y_test_dt, predictions2)
r2_score(Y_test_dt, predictions2)